About the Dataset

The data is about Coronavirus (COVID-19) Vaccinations

According to the website, the vaccination dataset is compiled from the most recent official figures published by governments and health ministries worldwide. Population estimates used for per-capita metrics come from the United Nations World Population Prospects, and income groups follow the World Bank classification. Source: https://ourworldindata.org/covid-vaccinations

# Load required libraries
library(tidyverse)
library(lares) # show the structure of data
library(vtable) # show the mean, sd, min, max
library(caret) 
library(psycho) # Standardizing
library('DescTools')
library(ggcorrplot)
library(plotly) # Using plotly for interactive plots
library(fastDummies) # for dummy encoding

Loading the dataset

# Read the downloaded OWID COVID-19 export; the CSV is expected in the
# current working directory
data <- read.csv(file = "owid-covid-data.csv")
# Number of rows and columns in the raw table
dim(data)
## [1] 351787     67
# Preview the first five rows of the raw data
head(data, n = 5)
##   iso_code continent    location       date total_cases new_cases
## 1      AFG      Asia Afghanistan 2020-01-03          NA         0
## 2      AFG      Asia Afghanistan 2020-01-04          NA         0
## 3      AFG      Asia Afghanistan 2020-01-05          NA         0
## 4      AFG      Asia Afghanistan 2020-01-06          NA         0
## 5      AFG      Asia Afghanistan 2020-01-07          NA         0
##   new_cases_smoothed total_deaths new_deaths new_deaths_smoothed
## 1                 NA           NA          0                  NA
## 2                 NA           NA          0                  NA
## 3                 NA           NA          0                  NA
## 4                 NA           NA          0                  NA
## 5                 NA           NA          0                  NA
##   total_cases_per_million new_cases_per_million new_cases_smoothed_per_million
## 1                      NA                     0                             NA
## 2                      NA                     0                             NA
## 3                      NA                     0                             NA
## 4                      NA                     0                             NA
## 5                      NA                     0                             NA
##   total_deaths_per_million new_deaths_per_million
## 1                       NA                      0
## 2                       NA                      0
## 3                       NA                      0
## 4                       NA                      0
## 5                       NA                      0
##   new_deaths_smoothed_per_million reproduction_rate icu_patients
## 1                              NA                NA           NA
## 2                              NA                NA           NA
## 3                              NA                NA           NA
## 4                              NA                NA           NA
## 5                              NA                NA           NA
##   icu_patients_per_million hosp_patients hosp_patients_per_million
## 1                       NA            NA                        NA
## 2                       NA            NA                        NA
## 3                       NA            NA                        NA
## 4                       NA            NA                        NA
## 5                       NA            NA                        NA
##   weekly_icu_admissions weekly_icu_admissions_per_million
## 1                    NA                                NA
## 2                    NA                                NA
## 3                    NA                                NA
## 4                    NA                                NA
## 5                    NA                                NA
##   weekly_hosp_admissions weekly_hosp_admissions_per_million total_tests
## 1                     NA                                 NA          NA
## 2                     NA                                 NA          NA
## 3                     NA                                 NA          NA
## 4                     NA                                 NA          NA
## 5                     NA                                 NA          NA
##   new_tests total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
## 1        NA                       NA                     NA                 NA
## 2        NA                       NA                     NA                 NA
## 3        NA                       NA                     NA                 NA
## 4        NA                       NA                     NA                 NA
## 5        NA                       NA                     NA                 NA
##   new_tests_smoothed_per_thousand positive_rate tests_per_case tests_units
## 1                              NA            NA             NA            
## 2                              NA            NA             NA            
## 3                              NA            NA             NA            
## 4                              NA            NA             NA            
## 5                              NA            NA             NA            
##   total_vaccinations people_vaccinated people_fully_vaccinated total_boosters
## 1                 NA                NA                      NA             NA
## 2                 NA                NA                      NA             NA
## 3                 NA                NA                      NA             NA
## 4                 NA                NA                      NA             NA
## 5                 NA                NA                      NA             NA
##   new_vaccinations new_vaccinations_smoothed total_vaccinations_per_hundred
## 1               NA                        NA                             NA
## 2               NA                        NA                             NA
## 3               NA                        NA                             NA
## 4               NA                        NA                             NA
## 5               NA                        NA                             NA
##   people_vaccinated_per_hundred people_fully_vaccinated_per_hundred
## 1                            NA                                  NA
## 2                            NA                                  NA
## 3                            NA                                  NA
## 4                            NA                                  NA
## 5                            NA                                  NA
##   total_boosters_per_hundred new_vaccinations_smoothed_per_million
## 1                         NA                                    NA
## 2                         NA                                    NA
## 3                         NA                                    NA
## 4                         NA                                    NA
## 5                         NA                                    NA
##   new_people_vaccinated_smoothed new_people_vaccinated_smoothed_per_hundred
## 1                             NA                                         NA
## 2                             NA                                         NA
## 3                             NA                                         NA
## 4                             NA                                         NA
## 5                             NA                                         NA
##   stringency_index population_density median_age aged_65_older aged_70_older
## 1                0             54.422       18.6         2.581         1.337
## 2                0             54.422       18.6         2.581         1.337
## 3                0             54.422       18.6         2.581         1.337
## 4                0             54.422       18.6         2.581         1.337
## 5                0             54.422       18.6         2.581         1.337
##   gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence
## 1       1803.987              NA               597.029                9.59
## 2       1803.987              NA               597.029                9.59
## 3       1803.987              NA               597.029                9.59
## 4       1803.987              NA               597.029                9.59
## 5       1803.987              NA               597.029                9.59
##   female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand
## 1             NA           NA                 37.746                        0.5
## 2             NA           NA                 37.746                        0.5
## 3             NA           NA                 37.746                        0.5
## 4             NA           NA                 37.746                        0.5
## 5             NA           NA                 37.746                        0.5
##   life_expectancy human_development_index population
## 1           64.83                   0.511   41128772
## 2           64.83                   0.511   41128772
## 3           64.83                   0.511   41128772
## 4           64.83                   0.511   41128772
## 5           64.83                   0.511   41128772
##   excess_mortality_cumulative_absolute excess_mortality_cumulative
## 1                                   NA                          NA
## 2                                   NA                          NA
## 3                                   NA                          NA
## 4                                   NA                          NA
## 5                                   NA                          NA
##   excess_mortality excess_mortality_cumulative_per_million
## 1               NA                                      NA
## 2               NA                                      NA
## 3               NA                                      NA
## 4               NA                                      NA
## 5               NA                                      NA
# Compact structural overview: one line per column with its type and the
# first few values
str(data)
## 'data.frame':    351787 obs. of  67 variables:
##  $ iso_code                                  : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ continent                                 : chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ location                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ date                                      : chr  "2020-01-03" "2020-01-04" "2020-01-05" "2020-01-06" ...
##  $ total_cases                               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_cases                                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed                        : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_deaths                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths                                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_deaths_smoothed                       : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_cases_per_million                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_cases_per_million                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed_per_million            : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_deaths_per_million                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_per_million                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_deaths_smoothed_per_million           : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ reproduction_rate                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients_per_million                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients                             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients_per_million                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions_per_million         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions_per_million        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests                               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests                                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests_per_thousand                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_per_thousand                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed_per_thousand           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ positive_rate                             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_per_case                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_units                               : chr  "" "" "" "" ...
##  $ total_vaccinations                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_vaccinations_per_hundred            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated_per_hundred             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated_per_hundred       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters_per_hundred                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed_per_million     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed_per_hundred: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stringency_index                          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ population_density                        : num  54.4 54.4 54.4 54.4 54.4 ...
##  $ median_age                                : num  18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
##  $ aged_65_older                             : num  2.58 2.58 2.58 2.58 2.58 ...
##  $ aged_70_older                             : num  1.34 1.34 1.34 1.34 1.34 ...
##  $ gdp_per_capita                            : num  1804 1804 1804 1804 1804 ...
##  $ extreme_poverty                           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cardiovasc_death_rate                     : num  597 597 597 597 597 ...
##  $ diabetes_prevalence                       : num  9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
##  $ female_smokers                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ male_smokers                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ handwashing_facilities                    : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ hospital_beds_per_thousand                : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ life_expectancy                           : num  64.8 64.8 64.8 64.8 64.8 ...
##  $ human_development_index                   : num  0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 ...
##  $ population                                : num  41128772 41128772 41128772 41128772 41128772 ...
##  $ excess_mortality_cumulative_absolute      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative_per_million   : num  NA NA NA NA NA NA NA NA NA NA ...

Checking for missing values

# Flag every missing cell, then tally the flags column by column.
# is.na() never returns NA itself, so no na.rm handling is required.
is_null <- is.na(data)
num_null_values <- colSums(is_null)

# Named vector: column name -> number of missing entries
print(num_null_values)
##                                   iso_code 
##                                          0 
##                                  continent 
##                                          0 
##                                   location 
##                                          0 
##                                       date 
##                                          0 
##                                total_cases 
##                                      37991 
##                                  new_cases 
##                                       9611 
##                         new_cases_smoothed 
##                                      10870 
##                               total_deaths 
##                                      59620 
##                                 new_deaths 
##                                       9560 
##                        new_deaths_smoothed 
##                                      10790 
##                    total_cases_per_million 
##                                      37991 
##                      new_cases_per_million 
##                                       9611 
##             new_cases_smoothed_per_million 
##                                      10870 
##                   total_deaths_per_million 
##                                      59620 
##                     new_deaths_per_million 
##                                       9560 
##            new_deaths_smoothed_per_million 
##                                      10790 
##                          reproduction_rate 
##                                     166970 
##                               icu_patients 
##                                     314132 
##                   icu_patients_per_million 
##                                     314132 
##                              hosp_patients 
##                                     312824 
##                  hosp_patients_per_million 
##                                     312824 
##                      weekly_icu_admissions 
##                                     341552 
##          weekly_icu_admissions_per_million 
##                                     341552 
##                     weekly_hosp_admissions 
##                                     328484 
##         weekly_hosp_admissions_per_million 
##                                     328484 
##                                total_tests 
##                                     272400 
##                                  new_tests 
##                                     276384 
##                   total_tests_per_thousand 
##                                     272400 
##                     new_tests_per_thousand 
##                                     276384 
##                         new_tests_smoothed 
##                                     247822 
##            new_tests_smoothed_per_thousand 
##                                     247822 
##                              positive_rate 
##                                     255860 
##                             tests_per_case 
##                                     257439 
##                                tests_units 
##                                          0 
##                         total_vaccinations 
##                                     272338 
##                          people_vaccinated 
##                                     275742 
##                    people_fully_vaccinated 
##                                     279073 
##                             total_boosters 
##                                     304093 
##                           new_vaccinations 
##                                     286321 
##                  new_vaccinations_smoothed 
##                                     169536 
##             total_vaccinations_per_hundred 
##                                     272338 
##              people_vaccinated_per_hundred 
##                                     275742 
##        people_fully_vaccinated_per_hundred 
##                                     279073 
##                 total_boosters_per_hundred 
##                                     304093 
##      new_vaccinations_smoothed_per_million 
##                                     169536 
##             new_people_vaccinated_smoothed 
##                                     169743 
## new_people_vaccinated_smoothed_per_hundred 
##                                     169743 
##                           stringency_index 
##                                     154136 
##                         population_density 
##                                      53139 
##                                 median_age 
##                                      74062 
##                              aged_65_older 
##                                      83763 
##                              aged_70_older 
##                                      76846 
##                             gdp_per_capita 
##                                      79587 
##                            extreme_poverty 
##                                     176357 
##                      cardiovasc_death_rate 
##                                      78952 
##                        diabetes_prevalence 
##                                      65070 
##                             female_smokers 
##                                     147116 
##                               male_smokers 
##                                     149900 
##                     handwashing_facilities 
##                                     218141 
##                 hospital_beds_per_thousand 
##                                     110924 
##                            life_expectancy 
##                                      28126 
##                    human_development_index 
##                                      87343 
##                                 population 
##                                          0 
##       excess_mortality_cumulative_absolute 
##                                     339582 
##                excess_mortality_cumulative 
##                                     339582 
##                           excess_mortality 
##                                     339582 
##    excess_mortality_cumulative_per_million 
##                                     339582
# Show the details of the dataset as a plot (df_str() called with
# return = "plot"); presumably provided by the lares package loaded above
df_str(data, return = "plot")

As we can see from the above output, several columns have a very high number of null values. The first step is therefore to drop every column in which more than half of the rows (observations) are null.

**Note:** R does not always treat empty strings and question marks as missing values. We therefore first convert question marks and empty strings to NA and then count the missing values.

# Treat "?" and "" as missing values. Only text columns can contain these
# sentinels, so the replacement is restricted to them rather than comparing
# every cell of the frame against a string.
is_text <- vapply(
  data,
  function(column) is.character(column) || is.factor(column),
  logical(1)
)
if (any(is_text)) {
  data[is_text] <- lapply(
    data[is_text],
    function(column) replace(column, column %in% c("?", ""), NA)
  )
}

# Count the missing values in each column
null_counts <- colSums(is.na(data))

# Columns in which more than half of the rows are missing
columns_to_drop <- names(null_counts)[null_counts > nrow(data) / 2]

# Drop those columns; drop = FALSE keeps the result a data frame even if
# only a single column survives
data <- data[, !(names(data) %in% columns_to_drop), drop = FALSE]

dim(data)
## [1] 351787     35
# Drop rows with NA values using na.omit(): only rows that are complete
# across all remaining columns are kept
data <- na.omit(data)

# Check for null values in the filtered dataframe
is_null <- is.na(data)

# Count the number of null values in each column (is.na() never returns NA,
# so na.rm = TRUE has no effect here)
num_null_values <- colSums(is_null, na.rm = TRUE)

# Print the number of null values in each column; every count should now be 0
print(num_null_values)
##                                   iso_code 
##                                          0 
##                                  continent 
##                                          0 
##                                   location 
##                                          0 
##                                       date 
##                                          0 
##                                total_cases 
##                                          0 
##                                  new_cases 
##                                          0 
##                         new_cases_smoothed 
##                                          0 
##                               total_deaths 
##                                          0 
##                                 new_deaths 
##                                          0 
##                        new_deaths_smoothed 
##                                          0 
##                    total_cases_per_million 
##                                          0 
##                      new_cases_per_million 
##                                          0 
##             new_cases_smoothed_per_million 
##                                          0 
##                   total_deaths_per_million 
##                                          0 
##                     new_deaths_per_million 
##                                          0 
##            new_deaths_smoothed_per_million 
##                                          0 
##                          reproduction_rate 
##                                          0 
##                  new_vaccinations_smoothed 
##                                          0 
##      new_vaccinations_smoothed_per_million 
##                                          0 
##             new_people_vaccinated_smoothed 
##                                          0 
## new_people_vaccinated_smoothed_per_hundred 
##                                          0 
##                           stringency_index 
##                                          0 
##                         population_density 
##                                          0 
##                                 median_age 
##                                          0 
##                              aged_65_older 
##                                          0 
##                              aged_70_older 
##                                          0 
##                             gdp_per_capita 
##                                          0 
##                      cardiovasc_death_rate 
##                                          0 
##                        diabetes_prevalence 
##                                          0 
##                             female_smokers 
##                                          0 
##                               male_smokers 
##                                          0 
##                 hospital_beds_per_thousand 
##                                          0 
##                            life_expectancy 
##                                          0 
##                    human_development_index 
##                                          0 
##                                 population 
##                                          0
# Open the spreadsheet-style viewer only in an interactive session: View()
# needs a display and serves no purpose when the script is run
# non-interactively (e.g. via Rscript or while knitting).
if (interactive()) {
  View(data)
}

We can now see that we do not have any null values in the columns.

Dropping all the Null Values

# Keep only complete rows. `data` already went through na.omit() in the
# previous step, so this selects the same rows.
data_clean <- na.omit(data)

# Report the shape of the cleaned data
cat('The shape of the data:', nrow(data_clean), 'rows/observations', 'and', ncol(data_clean), 'columns')
## The shape of the data: 80318 rows/observations and 35 columns

The dataset already meets the eligibility requirement for our analysis of at least 7000 rows and 10 columns.

Removing Unwanted Columns

# Remove the columns that are not needed for the analysis with subset():
# per-capita rates, the stringency index, and the demographic and health
# indicators
data <- subset(
  data,
  select = -c(
    # per-capita / per-hundred rates
    total_cases_per_million,
    new_cases_per_million,
    new_cases_smoothed_per_million,
    total_deaths_per_million,
    new_deaths_per_million,
    new_deaths_smoothed_per_million,
    new_vaccinations_smoothed_per_million,
    new_people_vaccinated_smoothed_per_hundred,
    # policy
    stringency_index,
    # demographics and economy
    population,
    population_density,
    median_age,
    aged_65_older,
    aged_70_older,
    gdp_per_capita,
    human_development_index,
    life_expectancy,
    # health indicators
    cardiovasc_death_rate,
    diabetes_prevalence,
    female_smokers,
    male_smokers,
    hospital_beds_per_thousand
  )
)

dim(data)
## [1] 80318    13

A. Identify which variables are categorical, discrete and continuous in the chosen data set and show using some visualization or plot. Explore whether there are missing values for any of the variables.

# Turn every cell equal to 0 into NA, then drop all rows that contain an NA,
# i.e. every row that had a 0 in any column.
# NOTE(review): this also discards days with a genuine count of zero (e.g.
# no new deaths) -- confirm that this is intended.
data_no_zeros <- replace(data, data == 0, NA)
data <- na.omit(data_no_zeros)

# Inspect the column types of the remaining data
str(data)
## 'data.frame':    43428 obs. of  13 variables:
##  $ iso_code                      : chr  "ALB" "ALB" "ALB" "ALB" ...
##  $ continent                     : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ location                      : chr  "Albania" "Albania" "Albania" "Albania" ...
##  $ date                          : chr  "2021-01-11" "2021-01-12" "2021-01-13" "2021-01-14" ...
##  $ total_cases                   : num  63033 63595 63971 64627 65334 ...
##  $ new_cases                     : num  655 562 376 656 707 660 641 581 474 292 ...
##  $ new_cases_smoothed            : num  577 594 621 621 618 ...
##  $ total_deaths                  : num  1233 1241 1247 1252 1256 ...
##  $ new_deaths                    : num  3 8 6 5 4 5 4 5 7 4 ...
##  $ new_deaths_smoothed           : num  6.14 6.86 6.86 6 5.57 ...
##  $ reproduction_rate             : num  1.06 1.07 1.07 1.07 1.07 1.07 1.07 1.08 1.09 1.1 ...
##  $ new_vaccinations_smoothed     : num  64 64 63 66 62 62 58 55 51 47 ...
##  $ new_people_vaccinated_smoothed: num  64 64 63 66 62 62 58 55 51 47 ...
##  - attr(*, "na.action")= 'omit' Named int [1:36890] 18 19 20 21 22 23 138 139 144 145 ...
##   ..- attr(*, "names")= chr [1:36890] "3176" "3177" "3178" "3179" ...
# Show the details of the cleaned dataset as a plot (df_str() called with
# return = "plot"); presumably provided by the lares package loaded above
df_str(data, return = "plot")

Identify which variables are categorical, discrete and continuous in the chosen data set

-Categorical Variables

  • iso_code
  • continent
  • location
  • date

-Discrete Variables

  • total_cases
  • new_cases
  • total_deaths
  • new_deaths
  • new_vaccinations_smoothed
  • new_people_vaccinated_smoothed

-Continuous Variables

  • new_cases_smoothed
  • new_deaths_smoothed
  • reproduction_rate

B. Calculate the statistical parameters (mean, median, minimum, maximum, and standard deviation) for each of the numerical variables.

# Keep only the numeric columns. vapply() guarantees exactly one logical per
# column, unlike sapply(), whose return type depends on its input.
numerical_data <- data[vapply(data, is.numeric, logical(1))]

# Reshape to long form (one row per variable/value pair) and compute the
# summary statistics per variable. group_by() also orders the result by
# variable name. summarise() replaces the superseded summarise_at().
numerical_data %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(
    Mean = mean(value),
    Median = median(value),
    Min = min(value),
    Max = max(value),
    Sd = sd(value)
  )
## # A tibble: 9 × 6
##   name                                  Mean    Median     Min       Max      Sd
##   <chr>                                <dbl>     <dbl>   <dbl>     <dbl>   <dbl>
## 1 new_cases                        13950.      1545      1        6.97e6 1.04e+5
## 2 new_cases_smoothed               11284.      1497.     0.143    5.88e6 8.82e+4
## 3 new_deaths                         101.        14      1        1.14e4 3.30e+2
## 4 new_deaths_smoothed                 90.1       13.1    0.143    4.19e3 2.83e+2
## 5 new_people_vaccinated_smoothed   87454.      7177      1        6.79e6 3.77e+5
## 6 new_vaccinations_smoothed       214364.     25939      1        2.24e7 8.55e+5
## 7 reproduction_rate                    0.994      0.97  -0.03     4.22e0 2.95e-1
## 8 total_cases                    4084958.    839881    129        9.94e7 9.96e+6
## 9 total_deaths                     61070.     12374.     1        1.08e6 1.40e+5
# Getting the summary statistics table for the numeric columns (N, mean,
# standard deviation, min, 25th/75th percentiles and max per variable)
st(numerical_data)
Summary Statistics
Variable N Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
total_cases 43428 4084958 9958820 129 278548 3488102 99411696
new_cases 43428 13950 103615 1 406 6422 6966046
new_cases_smoothed 43428 11284 88184 0.14 410 5975 5882129
total_deaths 43428 61070 140487 1 3583 36870 1082456
new_deaths 43428 101 330 1 4 56 11447
new_deaths_smoothed 43428 90 283 0.14 3.6 54 4190
reproduction_rate 43428 0.99 0.29 -0.03 0.81 1.1 4.2
new_vaccinations_smoothed 43428 214364 854845 1 5654 120608 22424286
new_people_vaccinated_smoothed 43428 87454 377164 1 1042 37637 6785334

C. Apply Min-Max Normalization, Z-score Standardization and the Robust Scaler to the numerical data variables.

Min-Max Normalization

# Min-Max Normalization with caret: learn each column's range first, then
# apply the fitted transformation to the same numeric data
preproc <- preProcess(numerical_data, method = "range")
scaled_data_minmax <- preproc %>%
  predict(numerical_data)
head(scaled_data_minmax, n = 10)
##       total_cases    new_cases new_cases_smoothed total_deaths   new_deaths
## 3159 0.0006327634 9.388398e-05       9.814237e-05  0.001138154 0.0001747335
## 3160 0.0006384167 8.053350e-05       1.009352e-04  0.001145544 0.0006115674
## 3161 0.0006421989 5.383255e-05       1.055740e-04  0.001151087 0.0004368338
## 3162 0.0006487977 9.402753e-05       1.054768e-04  0.001155706 0.0003494671
## 3163 0.0006559096 1.013488e-04       1.050397e-04  0.001159402 0.0002621003
## 3164 0.0006625487 9.460174e-05       1.041410e-04  0.001164021 0.0003494671
## 3165 0.0006689966 9.187423e-05       1.033639e-04  0.001167716 0.0002621003
## 3166 0.0006748410 8.326102e-05       1.015666e-04  0.001172335 0.0003494671
## 3167 0.0006796090 6.790080e-05       9.942948e-05  0.001178802 0.0005242006
## 3168 0.0006825463 4.177406e-05       9.738941e-05  0.001182497 0.0002621003
##      new_deaths_smoothed reproduction_rate new_vaccinations_smoothed
## 3159         0.001432030         0.2564706              2.809454e-06
## 3160         0.001602441         0.2588235              2.809454e-06
## 3161         0.001602441         0.2588235              2.764860e-06
## 3162         0.001397900         0.2588235              2.898643e-06
## 3163         0.001295510         0.2588235              2.720265e-06
## 3164         0.001261618         0.2588235              2.720265e-06
## 3165         0.001159228         0.2588235              2.541887e-06
## 3166         0.001227488         0.2611765              2.408104e-06
## 3167         0.001193358         0.2635294              2.229725e-06
## 3168         0.001125098         0.2658824              2.051347e-06
##      new_people_vaccinated_smoothed
## 3159                   9.284732e-06
## 3160                   9.284732e-06
## 3161                   9.137356e-06
## 3162                   9.579486e-06
## 3163                   8.989979e-06
## 3164                   8.989979e-06
## 3165                   8.400472e-06
## 3166                   7.958342e-06
## 3167                   7.368835e-06
## 3168                   6.779328e-06

Z-score Standardization

# Z-score standardization: scale() centres and scales every column; the
# resulting matrix is turned back into a data frame
z_score_standardized_data <- numerical_data %>%
  scale() %>%
  as.data.frame()
head(z_score_standardized_data, n = 10)
##      total_cases  new_cases new_cases_smoothed total_deaths new_deaths
## 3159  -0.4038556 -0.1283085         -0.1214125   -0.4259262 -0.2968933
## 3160  -0.4037992 -0.1292061         -0.1212263   -0.4258692 -0.2817319
## 3161  -0.4037614 -0.1310012         -0.1209168   -0.4258265 -0.2877964
## 3162  -0.4036955 -0.1282989         -0.1209233   -0.4257909 -0.2908287
## 3163  -0.4036245 -0.1278067         -0.1209525   -0.4257624 -0.2938610
## 3164  -0.4035583 -0.1282603         -0.1210124   -0.4257269 -0.2908287
## 3165  -0.4034939 -0.1284437         -0.1210642   -0.4256984 -0.2938610
## 3166  -0.4034356 -0.1290227         -0.1211841   -0.4256628 -0.2908287
## 3167  -0.4033880 -0.1300554         -0.1213267   -0.4256130 -0.2847641
## 3168  -0.4033587 -0.1318119         -0.1214628   -0.4255845 -0.2938610
##      new_deaths_smoothed reproduction_rate new_vaccinations_smoothed
## 3159          -0.2971834         0.2249132                -0.2506890
## 3160          -0.2946569         0.2588510                -0.2506890
## 3161          -0.2946569         0.2588510                -0.2506902
## 3162          -0.2976894         0.2588510                -0.2506867
## 3163          -0.2992074         0.2588510                -0.2506914
## 3164          -0.2997099         0.2588510                -0.2506914
## 3165          -0.3012279         0.2588510                -0.2506960
## 3166          -0.3002159         0.2927888                -0.2506996
## 3167          -0.3007219         0.3267265                -0.2507042
## 3168          -0.3017339         0.3606643                -0.2507089
##      new_people_vaccinated_smoothed
## 3159                     -0.2317030
## 3160                     -0.2317030
## 3161                     -0.2317056
## 3162                     -0.2316977
## 3163                     -0.2317083
## 3164                     -0.2317083
## 3165                     -0.2317189
## 3166                     -0.2317268
## 3167                     -0.2317374
## 3168                     -0.2317480

Robust Scaler

# Robust Scalar
#' Robust scaling of a numeric vector.
#'
#' Centres `x` on its median and divides by its interquartile range
#' (75th percentile minus 25th percentile).
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be ignored when computing the median
#'   and the quartiles? Defaults to `FALSE`, in which case `quantile()`
#'   raises an error if `x` contains `NA`.
#' @return An unnamed numeric vector the same length as `x`; positions that
#'   are `NA` in `x` stay `NA`.
robust_scalar <- function(x, na.rm = FALSE) {
  centre <- median(x, na.rm = na.rm)
  # names = FALSE keeps the "25%"/"75%" labels out of the result
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  # A zero IQR (e.g. a constant column) would turn every value into NaN or
  # Inf; fall back to a unit scale so such columns are only centred.
  if (isTRUE(iqr == 0)) {
    iqr <- 1
  }
  (x - centre) / iqr
}
# Apply the robust scaler to every numeric column and collect the scaled
# columns in a data frame
robust_scalar_data <- numerical_data %>%
  sapply(robust_scalar) %>%
  as.data.frame()
head(robust_scalar_data, n = 10)
##    total_cases  new_cases new_cases_smoothed total_deaths new_deaths
## 1   -0.2420424 -0.1479388         -0.1652539   -0.3347128 -0.2115385
## 2   -0.2418673 -0.1633976         -0.1623023   -0.3344724 -0.1153846
## 3   -0.2417501 -0.1943152         -0.1574000   -0.3342922 -0.1538462
## 4   -0.2415457 -0.1477726         -0.1575028   -0.3341420 -0.1730769
## 5   -0.2413254 -0.1392952         -0.1579647   -0.3340218 -0.1923077
## 6   -0.2411198 -0.1471077         -0.1589144   -0.3338716 -0.1730769
## 7   -0.2409201 -0.1502660         -0.1597356   -0.3337514 -0.1923077
## 8   -0.2407391 -0.1602394         -0.1616351   -0.3336012 -0.1730769
## 9   -0.2405914 -0.1780253         -0.1638936   -0.3333909 -0.1346154
## 10  -0.2405004 -0.2082779         -0.1660496   -0.3332707 -0.1923077
##    new_deaths_smoothed reproduction_rate new_vaccinations_smoothed
## 1             -0.14000         0.2647059                -0.2250905
## 2             -0.12572         0.2941176                -0.2250905
## 3             -0.12572         0.2941176                -0.2250992
## 4             -0.14286         0.2941176                -0.2250731
## 5             -0.15144         0.2941176                -0.2251079
## 6             -0.15428         0.2941176                -0.2251079
## 7             -0.16286         0.2941176                -0.2251427
## 8             -0.15714         0.3235294                -0.2251688
## 9             -0.16000         0.3529412                -0.2252036
## 10            -0.16572         0.3823529                -0.2252384
##    new_people_vaccinated_smoothed
## 1                      -0.1943695
## 2                      -0.1943695
## 3                      -0.1943968
## 4                      -0.1943148
## 5                      -0.1944241
## 6                      -0.1944241
## 7                      -0.1945334
## 8                      -0.1946154
## 9                      -0.1947247
## 10                     -0.1948340

D. Line, Scatter and Heatmaps can be used to show the correlation between the features of the dataset.

Line Plots

Total Cases vs Total Deaths

# Basic line plot of cumulative deaths against cumulative cases.
# group = location draws one line per country; without it geom_line() joins
# the observations of all countries into a single zig-zagging path.
ggplot(data = data, aes(x = total_cases, y = total_deaths, group = location)) +
  geom_line(color = "black") +
  labs(title = "Line Plot of Total Cases vs Total Deaths")

Line Chart of New Deaths and New Deaths Smoothed

# Convert 'date' column to Date type if it's not already
data$date <- as.Date(data$date)

# Plot 'new deaths' and 'new deaths smoothed' against dates with different
# colors for each series. group = location keeps every country on its own
# line; otherwise all countries' values for a date are joined into one
# sawtooth path.
ggplot(data, aes(x = date, group = location)) +
  geom_line(aes(y = new_deaths, color = "New deaths")) +
  geom_line(aes(y = new_deaths_smoothed, color = "New deaths smoothed")) +
  scale_color_manual(
    values = c("New deaths" = "blue", "New deaths smoothed" = "green")
  ) +
  labs(
    title = "New Deaths and New Deaths Smoothed Over Time",
    x = "Date",
    y = "Number of Deaths",  # the plotted series are deaths, not cases
    color = "Legend"
  ) +
  theme_minimal()

Line Chart of Total Cases VS Total Deaths With Time

# Plot cumulative cases and cumulative deaths against dates, one color per
# series. group = location keeps every country on its own line; otherwise
# all countries' values for a date are joined into one sawtooth path.
ggplot(data, aes(x = date, group = location)) +
  geom_line(aes(y = total_cases, color = "total cases")) +
  geom_line(aes(y = total_deaths, color = "total deaths")) +
  scale_color_manual(
    values = c("total cases" = "red", "total deaths" = "green")
  ) +
  labs(
    title = "Total Cases and Total Deaths Over Time",
    x = "Date",
    y = "Count",  # the axis carries both cases and deaths
    color = "Legend"
  ) +
  theme_minimal()

Scatter Plots

Scatter Plot of Total Deaths Vs Total Cases

# Scatter plot of cumulative deaths against cumulative cases: one small point
# per observation, colored by continent
ggplot(data) +
  geom_point(
    aes(x = total_cases, y = total_deaths, color = factor(continent)),
    size = 0.5
  ) +
  labs(
    title = "Scatter Plot of Total Cases vs Total Deaths",
    x = "Total Cases",
    y = "Total Deaths",
    color = "Continent"
  ) +
  theme_minimal()

Heatmaps can be used to show the correlation between the features of the dataset

# Pairwise correlations between the numeric variables, rounded to one decimal
corr <- numerical_data %>%
  cor() %>%
  round(1)

# Matching matrix of correlation p-values
p.mat <- cor_pmat(numerical_data)

# Correlogram with the variables reordered by hierarchical clustering
ggcorrplot(corr, hc.order = TRUE)

# Same correlogram, also handing over the p-values so that ggcorrplot can
# flag the non-significant coefficients
ggcorrplot(corr, hc.order = TRUE, p.mat = p.mat)

# Calculate the correlation matrix
correlation_matrix <- cor(numerical_data)

# Reshape the correlation matrix for plotting: one row per pair of features
correlation_data <- as.data.frame(as.table(correlation_matrix))
colnames(correlation_data) <- c("Feature1", "Feature2", "Correlation")

# Create a heatmap of the correlation matrix with rotated x-axis labels.
# Correlations are signed, so a diverging scale centred on 0 and fixed to
# [-1, 1] is used: negative values get their own color instead of blending
# into the "near zero" end of a one-directional white-to-blue gradient.
ggplot(
  data = correlation_data,
  aes(x = Feature1, y = Feature2, fill = Correlation)
) +
  geom_tile() +
  scale_fill_gradient2(
    low = "darkred", mid = "white", high = "darkblue",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(title = "Correlation Heatmap", x = "Feature 1", y = "Feature 2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

E. Graphics and descriptive understanding should be provided along with Data Exploratory analysis (EDA). Identify subgroups of features that can explore some interesting facts.

Total Deaths Per Continent

# Total deaths per continent.
# `total_deaths` is a cumulative series, so summing it over every daily row
# counts the running total again for each day. Take each country's highest
# cumulative value first, then add the countries up per continent.
continent_deaths <- data %>%
  filter(!is.na(total_deaths)) %>%
  group_by(continent, location) %>%
  summarise(total_deaths = max(total_deaths), .groups = "drop") %>%
  group_by(continent) %>%
  summarise(total_deaths = sum(total_deaths), .groups = "drop")

# Create a bar chart of total deaths by continent
ggplot(
  data = continent_deaths,
  aes(x = continent, y = total_deaths, fill = continent)
) +
  geom_col() +
  labs(
    title = "Total Deaths by Continent",
    x = "Continent",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  # Rotate x-axis labels for better readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Total Cases Per Continent

# Total cases per continent.
# `total_cases` is a cumulative series, so summing it over every daily row
# counts the running total again for each day. Take each country's highest
# cumulative value first, then add the countries up per continent.
continent_cases <- data %>%
  filter(!is.na(total_cases)) %>%
  group_by(continent, location) %>%
  summarise(total_cases = max(total_cases), .groups = "drop") %>%
  group_by(continent) %>%
  summarise(total_cases = sum(total_cases), .groups = "drop")

# Create a bar chart of total cases by continent
ggplot(
  data = continent_cases,
  aes(x = continent, y = total_cases, fill = continent)
) +
  geom_col() +
  labs(
    title = "Total Cases by Continent",
    x = "Continent",
    y = "Total Cases"
  ) +
  theme_minimal() +
  # Rotate x-axis labels for better readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Fixed color for each continent, looked up by continent name
continent_colors <- c(
  "Asia" = "red",
  "Africa" = "orange",
  "Europe" = "green",
  "North America" = "blue",
  "Oceania" = "purple",
  "South America" = "yellow"
)

# Plot line chart of 'total_cases' over time with manual continent colors and
# increased transparency. group = location draws one line per country
# (colored by its continent); grouping by continent alone joins all of a
# continent's countries into a single sawtooth path.
ggplot(
  data,
  aes(x = date, y = total_cases, color = continent, group = location)
) +
  geom_line(alpha = 0.5) +   # Set the alpha value (transparency) to 0.5
  scale_color_manual(values = continent_colors) +
  labs(
    title = "Total Cases Over Time by Continent",
    x = "Date",
    y = "Total Cases",
    color = "Continent"
  ) +
  theme_minimal()

# Total cases over time, with continents told apart by both color and line
# type. group = location draws one line per country instead of joining all
# countries of a continent into a single sawtooth path.
ggplot(
  data,
  aes(
    x = date, y = total_cases,
    color = continent, linetype = continent, group = location
  )
) +
  geom_line() +
  scale_color_manual(values = continent_colors) +
  scale_linetype_manual(
    values = c(
      "Asia" = "solid", "Africa" = "dashed", "Europe" = "dotted",
      "North America" = "dotdash", "Oceania" = "longdash",
      "South America" = "twodash"
    )
  ) +
  # Both legends get the same title so ggplot2 merges them into one
  labs(
    title = "Total Cases Over Time by Continent",
    x = "Date",
    y = "Total Cases",
    color = "Continent",
    linetype = "Continent"
  ) +
  theme_minimal()

# Total cases over time, one panel per continent with its own y-axis range.
# group = location draws one line per country instead of joining all
# countries of a continent into a single sawtooth path.
ggplot(
  data,
  aes(x = date, y = total_cases, color = continent, group = location)
) +
  geom_line() +
  scale_color_manual(values = continent_colors) +
  facet_wrap(~continent, scales = "free_y") +
  labs(
    title = "Total Cases Over Time by Continent",
    x = "Date",
    y = "Total Cases",
    color = "Continent"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Interactive version of the total-cases-over-time chart, colored by
# continent. Grouping by location makes plotly break the line between
# countries instead of joining every country of a continent into one path.
data %>%
  group_by(location) %>%
  plot_ly(
    x = ~date, y = ~total_cases, color = ~continent,
    type = "scatter", mode = "lines"
  ) %>%
  layout(
    title = "Total Cases Over Time by Continent",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Total Cases"),
    # plotly expects the legend title as a list with a `text` entry
    legend = list(title = list(text = "Continent"))
  )

Total Deaths Per Countries Per Continent

European Countries

# Filter data for Europe
europe_data <- data %>%
  filter(continent == "Europe")

# `total_deaths` is cumulative, so plotting every daily row as a stacked bar
# adds the running total to itself once per day. Reduce each country to its
# highest cumulative value before plotting.
europe_deaths <- europe_data %>%
  filter(!is.na(total_deaths)) %>%
  group_by(location) %>%
  summarise(total_deaths = max(total_deaths), .groups = "drop")

# Bar plot for Europe
ggplot(europe_deaths, aes(x = location, y = total_deaths, fill = location)) +
  geom_col() +
  scale_fill_viridis_d() +  # Use viridis color scale
  labs(
    title = "Total Deaths in European Countries",
    x = "Countries",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position = "none")  # Remove the legend

Asian Countries

# Filter data for Asia
asia_data <- data %>%
  filter(continent == "Asia")

# `total_deaths` is cumulative, so plotting every daily row as a stacked bar
# adds the running total to itself once per day. Reduce each country to its
# highest cumulative value before plotting.
asia_deaths <- asia_data %>%
  filter(!is.na(total_deaths)) %>%
  group_by(location) %>%
  summarise(total_deaths = max(total_deaths), .groups = "drop")

# Bar plot for Asia
ggplot(asia_deaths, aes(x = location, y = total_deaths, fill = location)) +
  geom_col() +
  scale_fill_viridis_d() +    # Use viridis color scale
  labs(
    title = "Total Deaths in Asian Countries",
    x = "Countries",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position = "none")   # Remove the legend

African Countries

# Filter data for Africa
africa_data <- data %>%
  filter(continent == "Africa")

# `total_deaths` is cumulative, so plotting every daily row as a stacked bar
# adds the running total to itself once per day. Reduce each country to its
# highest cumulative value before plotting.
africa_deaths <- africa_data %>%
  filter(!is.na(total_deaths)) %>%
  group_by(location) %>%
  summarise(total_deaths = max(total_deaths), .groups = "drop")

# Bar plot for Africa
ggplot(africa_deaths, aes(x = location, y = total_deaths, fill = location)) +
  geom_col() +
  scale_fill_viridis_d() +    # Use viridis color scale
  labs(
    title = "Total Deaths in African Countries",
    x = "Countries",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(legend.position = "none")   # Remove the legend

Oceania Countries

# Filter data for Oceania
oceania_data <- data %>%
  filter(continent == "Oceania")

# `total_deaths` is cumulative, so plotting every daily row as a stacked bar
# adds the running total to itself once per day. Reduce each country to its
# highest cumulative value before plotting.
oceania_deaths <- oceania_data %>%
  filter(!is.na(total_deaths)) %>%
  group_by(location) %>%
  summarise(total_deaths = max(total_deaths), .groups = "drop")

# Bar plot for Oceania
ggplot(oceania_deaths, aes(x = location, y = total_deaths, fill = location)) +
  geom_col() +
  scale_fill_viridis_d() +    # Use viridis color scale
  labs(
    title = "Total Deaths in Oceania Countries",
    x = "Countries",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")   # Remove the legend

South America Countries

# Filter data for South America
south.america_data <- data %>%
  filter(continent == "South America")

# `total_deaths` is cumulative, so plotting every daily row as a stacked bar
# adds the running total to itself once per day. Reduce each country to its
# highest cumulative value before plotting.
south_america_deaths <- south.america_data %>%
  filter(!is.na(total_deaths)) %>%
  group_by(location) %>%
  summarise(total_deaths = max(total_deaths), .groups = "drop")

# Bar plot for South America
ggplot(
  south_america_deaths,
  aes(x = location, y = total_deaths, fill = location)
) +
  geom_col() +
  scale_fill_viridis_d() +    # Use viridis color scale
  labs(
    title = "Total Deaths in South American Countries",
    x = "Countries",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")   # Remove the legend

North America Countries

# Filter data for North America
north.america_data <- data %>%
  filter(continent == "North America")

# `total_deaths` is cumulative, so plotting every daily row as a stacked bar
# adds the running total to itself once per day. Reduce each country to its
# highest cumulative value before plotting.
north_america_deaths <- north.america_data %>%
  filter(!is.na(total_deaths)) %>%
  group_by(location) %>%
  summarise(total_deaths = max(total_deaths), .groups = "drop")

# Bar plot for North America
ggplot(
  north_america_deaths,
  aes(x = location, y = total_deaths, fill = location)
) +
  geom_col() +
  scale_fill_viridis_d() +    # Use viridis color scale
  labs(
    title = "Total Deaths in North American Countries",
    x = "Countries",
    y = "Total Deaths"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")   # Remove the legend

F. Apply dummy encoding to categorical variables (at least one variable used from the data set) and discuss the benefits of dummy encoding to understand the categorical data.

# Select all character columns (the categorical variables). select(where())
# replaces the superseded select_if().
categorical_data <- data %>%
  select(where(is.character))

# Create dummy variables of continent: one 0/1 indicator column per
# continent, with the original `continent` column removed
categorical_data <- dummy_cols(
  categorical_data,
  select_columns = "continent",
  remove_selected_columns = TRUE
)

# Drop the remaining character columns, which are not dummy encoded
categorical_data <- subset(categorical_data, select = -c(iso_code, location))
head(categorical_data, 10)
##    continent_Africa continent_Asia continent_Europe continent_North America
## 1                 0              0                1                       0
## 2                 0              0                1                       0
## 3                 0              0                1                       0
## 4                 0              0                1                       0
## 5                 0              0                1                       0
## 6                 0              0                1                       0
## 7                 0              0                1                       0
## 8                 0              0                1                       0
## 9                 0              0                1                       0
## 10                0              0                1                       0
##    continent_Oceania continent_South America
## 1                  0                       0
## 2                  0                       0
## 3                  0                       0
## 4                  0                       0
## 5                  0                       0
## 6                  0                       0
## 7                  0                       0
## 8                  0                       0
## 9                  0                       0
## 10                 0                       0

G. Apply PCA with your chosen number of components. Write up a short profile of the first few components extracted based on your understanding.

# With Min - Max Scaling
# Using cbind() to combine data frames column-wise
minmax_df <- cbind(categorical_data, scaled_data_minmax)

# Apply PCA and keep the first `num_components` principal components
num_components <- 3
# scale. = FALSE: the numeric columns are already min-max scaled. Letting
# prcomp() standardise every column again would discard that scaling, so the
# PCA would no longer depend on the scaling method that was applied.
pca_minmax_df <- prcomp(minmax_df, center = TRUE, scale. = FALSE)
# drop = FALSE keeps a matrix even if a single component is requested
minmax_df_pca <- as.data.frame(
  pca_minmax_df$x[, seq_len(num_components), drop = FALSE]
)
head(minmax_df_pca, 10)
##          PC1        PC2        PC3
## 1  0.9328831 -0.7438877 -0.2006551
## 2  0.9261213 -0.7465153 -0.2014218
## 3  0.9288748 -0.7452462 -0.2016479
## 4  0.9305913 -0.7454615 -0.1994950
## 5  0.9322660 -0.7450984 -0.1987576
## 6  0.9313620 -0.7452897 -0.1992848
## 7  0.9332195 -0.7447004 -0.1989231
## 8  0.9319060 -0.7456301 -0.1982743
## 9  0.9301392 -0.7466711 -0.1979726
## 10 0.9347968 -0.7454595 -0.1964882
# With Z-score Standardization
# Using cbind() to combine data frames column-wise
z_score_df <- cbind(categorical_data, z_score_standardized_data)

# Apply PCA and keep the first `num_components` principal components
num_components <- 3
# scale. = FALSE: the numeric columns are already z-score standardised.
# Letting prcomp() standardise every column again would discard the scaling
# chosen upstream, so the PCA would no longer depend on the scaling method.
pca_z_score_df <- prcomp(z_score_df, center = TRUE, scale. = FALSE)
# drop = FALSE keeps a matrix even if a single component is requested
z_score_df_pca <- as.data.frame(
  pca_z_score_df$x[, seq_len(num_components), drop = FALSE]
)
head(z_score_df_pca, 10)
##          PC1        PC2        PC3
## 1  0.9328831 -0.7438877 -0.2006551
## 2  0.9261213 -0.7465153 -0.2014218
## 3  0.9288748 -0.7452462 -0.2016479
## 4  0.9305913 -0.7454615 -0.1994950
## 5  0.9322660 -0.7450984 -0.1987576
## 6  0.9313620 -0.7452897 -0.1992848
## 7  0.9332195 -0.7447004 -0.1989231
## 8  0.9319060 -0.7456301 -0.1982743
## 9  0.9301392 -0.7466711 -0.1979726
## 10 0.9347968 -0.7454595 -0.1964882
# With the Robust Scaler
# Using cbind() to combine data frames column-wise
robust_scalar_df <- cbind(categorical_data, robust_scalar_data)

# Apply PCA and keep the first `num_components` principal components
num_components <- 3
# scale. = FALSE: the numeric columns are already robust-scaled. Letting
# prcomp() standardise every column again would discard that scaling, so the
# PCA would no longer depend on the scaling method that was applied.
pca_robust_scalar_df <- prcomp(robust_scalar_df, center = TRUE, scale. = FALSE)
# drop = FALSE keeps a matrix even if a single component is requested
robust_scalar_df_pca <- as.data.frame(
  pca_robust_scalar_df$x[, seq_len(num_components), drop = FALSE]
)
head(robust_scalar_df_pca, 10)
##          PC1        PC2        PC3
## 1  0.9328831 -0.7438877 -0.2006551
## 2  0.9261213 -0.7465153 -0.2014218
## 3  0.9288748 -0.7452462 -0.2016479
## 4  0.9305913 -0.7454615 -0.1994950
## 5  0.9322660 -0.7450984 -0.1987576
## 6  0.9313620 -0.7452897 -0.1992848
## 7  0.9332195 -0.7447004 -0.1989231
## 8  0.9319060 -0.7456301 -0.1982743
## 9  0.9301392 -0.7466711 -0.1979726
## 10 0.9347968 -0.7454595 -0.1964882

Contribution Per Person

Chia Hua Lin(Awa)

# Tasks and the number of days each person (Awa and Ruby) spent on them
tasks <- c(
  "Days Taken to Code",
  "Days Taken to Solve Bugs",
  "Days Taken to Write Report",
  "Days Taken to Proofread"
)
Ruby <- c(10, 7, 8, 2)
Awa <- c(9, 9, 8, 2)

# Custom colors for each person
custom_colors <- c("Awa" = "#66c2a5", "Ruby" = "#fc8d62")

# Create a data frame with one row per task and one column per person
effort_data <- data.frame(tasks, Awa, Ruby)

# Reshape the data to long format for plotting: one row per task/person pair.
# pivot_longer() replaces the superseded gather().
effort_data_long <- effort_data %>%
  pivot_longer(cols = -tasks, names_to = "Person", values_to = "Days")

# Create a grouped bar chart (bars side by side per task) with custom colors
ggplot(effort_data_long, aes(x = tasks, y = Days, fill = Person)) +
  geom_col(position = position_dodge(width = 0.9)) +
  ggtitle("Total Days Allocated to Each Task") +
  xlab("Task") +
  ylab("Total Days") +
  scale_fill_manual(values = custom_colors) +  # Custom colors for the legend
  theme_classic() +
  theme(legend.position = "top")